{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(2006, 2)"
      ]
     },
     "execution_count": 61,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "with open('C:/Users/win8/PycharmProjects/textmining/venv/1111stop.txt', encoding='utf8') as file:\n",
    "    stopWord_list = [k.strip() for k in file.readlines()]\n",
    "\n",
    "train_df = pd.read_csv('C:/Users/win8/PycharmProjects/textmining/venv/train.txt',sep='\\t', header=None,)\n",
    "train_df = train_df.astype(str)\n",
    "train_df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "前1000篇文章分词共花费53.45秒\n",
      "前2000篇文章分词共花费89.40秒\n"
     ]
    }
   ],
   "source": [
    "import jieba\n",
    "import time\n",
    "\n",
    "train_df.columns = ['分类', '文章']\n",
    "stopword_list = [k.strip() for k in open('C:/Users/win8/Desktop/1111stop.txt', encoding='utf8').readlines() if k.strip() != '']\n",
    "cutWords_list = []\n",
    "i = 0\n",
    "startTime = time.time()\n",
    "for article in train_df['文章']:\n",
    "    cutWords = [k for k in jieba.cut(article) if k not in stopword_list]\n",
    "    i += 1\n",
    "    if i % 1000 == 0:\n",
    "        print('前%d篇文章分词共花费%.2f秒' %(i, time.time()-startTime))\n",
    "    cutWords_list.append(cutWords)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('C:/Users/win8/PycharmProjects/textmining/venv/cutWords_list.txt', 'w',encoding='utf-8') as file: \n",
    "    for cutWords in cutWords_list:\n",
    "        file.write(' '.join(cutWords) + '\\n')\n",
    "with open('C:/Users/win8/PycharmProjects/textmining/venv/cutWords_list.txt',encoding='utf-8') as file:\n",
    "    cutWords_list = [k.split() for k in file.readlines()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "\n",
    "tfidf = TfidfVectorizer(cutWords_list, min_df=20, max_df=0.3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(2006, 6127)\n"
     ]
    }
   ],
   "source": [
    "X = tfidf.fit_transform(train_df['文章'])\n",
    "print(X.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(2006,)"
      ]
     },
     "execution_count": 66,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.preprocessing import LabelEncoder\n",
    "import pandas as pd\n",
    "\n",
    "train_df = pd.read_csv('C:/Users/win8/PycharmProjects/textmining/venv/train.txt', sep='\\t', header=None)\n",
    "train_df = train_df.astype(str)\n",
    "labelEncoder = LabelEncoder()\n",
    "y = labelEncoder.fit_transform(train_df[0].values)\n",
    "y.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "train_df[0].value_counts()\n",
    "import time  \n",
    "from sklearn import metrics  \n",
    "import pickle as pickle  \n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.9626865671641791"
      ]
     },
     "execution_count": 69,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#逻辑回归模型 LR\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "logistic_model = LogisticRegression(multi_class='multinomial', solver='lbfgs')\n",
    "logistic_model.fit(train_X, train_y)\n",
    "logistic_model.score(test_X, test_y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [],
   "source": [
    "#保存模型的代码块\n",
    "import pickle\n",
    "\n",
    "with open('C:/Users/win8/PycharmProjects/textmining/venv/lr_40_0.3.model', 'rb') as file:\n",
    "    tfidf_model = pickle.load(file)\n",
    "    tfidfVectorizer = tfidf_model['tfidfVectorizer']\n",
    "    labelEncoder = tfidf_model['labelEncoder']\n",
    "    logistic_model = tfidf_model['logistic_model']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "train_df = pd.read_csv('C:/Users/win8/PycharmProjects/textmining/venv/train.txt', sep='\\t', header=None)\n",
    "train_df = train_df.astype(str)\n",
    "X = tfidfVectorizer.transform(train_df[1])\n",
    "y = labelEncoder.transform(train_df[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[0.96179402 0.95016611 0.94352159 0.9551495  0.94684385]\n",
      "0.9514950166112957\n"
     ]
    }
   ],
   "source": [
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.model_selection import ShuffleSplit\n",
    "from sklearn.model_selection import cross_val_score\n",
    "\n",
    "logistic_model = LogisticRegression(multi_class='multinomial', solver='lbfgs')\n",
    "cv_split = ShuffleSplit(n_splits=5, test_size=0.3)\n",
    "score_ndarray = cross_val_score(logistic_model, X, y, cv=cv_split)\n",
    "print(score_ndarray)\n",
    "print(score_ndarray.mean())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>互联网</th>\n",
       "      <th>体育</th>\n",
       "      <th>军事</th>\n",
       "      <th>娱乐</th>\n",
       "      <th>电商</th>\n",
       "      <th>财经</th>\n",
       "      <th>资讯</th>\n",
       "      <th>零售</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>互联网</th>\n",
       "      <td>51</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>体育</th>\n",
       "      <td>0</td>\n",
       "      <td>45</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>军事</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>43</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>娱乐</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>51</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>电商</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>38</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>财经</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>51</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>资讯</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>54</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>零售</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>54</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     互联网  体育  军事  娱乐  电商  财经  资讯  零售\n",
       "互联网   51   0   0   0   0   0   0   0\n",
       "体育     0  45   0   0   0   0   0   0\n",
       "军事     0   0  43   0   0   0   0   0\n",
       "娱乐     0   0   0  51   0   0   0   0\n",
       "电商     1   0   0   1  38   0   1   0\n",
       "财经     0   0   1   1   2  51   1   0\n",
       "资讯     0   0   3   0   0   0  54   0\n",
       "零售     0   0   0   0   1   2   1  54"
      ]
     },
     "execution_count": 73,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.metrics import confusion_matrix\n",
    "import pandas as pd\n",
    "\n",
    "train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2)\n",
    "logistic_model = LogisticRegression(multi_class='multinomial', solver='lbfgs')\n",
    "logistic_model.fit(train_X, train_y)\n",
    "predict_y = logistic_model.predict(test_X)\n",
    "pd.DataFrame(confusion_matrix(test_y, predict_y), \n",
    "             columns=labelEncoder.classes_, \n",
    "             index=labelEncoder.classes_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0       0.98      1.00      0.99        51\n",
      "           1       1.00      1.00      1.00        45\n",
      "           2       0.91      1.00      0.96        43\n",
      "           3       0.96      1.00      0.98        51\n",
      "           4       0.93      0.93      0.93        41\n",
      "           5       0.96      0.91      0.94        56\n",
      "           6       0.95      0.95      0.95        57\n",
      "           7       1.00      0.93      0.96        58\n",
      "\n",
      "   micro avg       0.96      0.96      0.96       402\n",
      "   macro avg       0.96      0.96      0.96       402\n",
      "weighted avg       0.96      0.96      0.96       402\n",
      "\n"
     ]
    }
   ],
   "source": [
    "from sklearn.metrics import confusion_matrix\n",
    "from sklearn.metrics import classification_report\n",
    "print(classification_report(test_y, predict_y))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0       0.99      1.00      1.00       252\n",
      "           1       1.00      1.00      1.00       250\n",
      "           2       0.97      1.00      0.98       250\n",
      "           3       0.99      1.00      1.00       252\n",
      "           4       0.98      0.98      0.98       250\n",
      "           5       0.99      0.96      0.97       251\n",
      "           6       0.98      0.98      0.98       250\n",
      "           7       0.99      0.98      0.99       251\n",
      "\n",
      "   micro avg       0.99      0.99      0.99      2006\n",
      "   macro avg       0.99      0.99      0.99      2006\n",
      "weighted avg       0.99      0.99      0.99      2006\n",
      "\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "test_df = pd.read_csv('C:/Users/win8/PycharmProjects/textmining/venv/train.txt', sep='\\t', header=None)\n",
    "test_X = tfidfVectorizer.transform(test_df[1])\n",
    "test_y = labelEncoder.transform(test_df[0])\n",
    "predict_y = logistic_model.predict(test_X)\n",
    "print(classification_report(test_y, predict_y))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
